{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "colab": {
   "private_outputs": true,
   "provenance": []
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "view-in-github"
   },
   "source": [
    "<a href=\"https://colab.research.google.com/github/mikexcohen/Statistics_book/blob/main/stats_ch08_probability.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Modern statistics: Intuition, Math, Python, R\n",
    "## Mike X Cohen (sincxpress.com)\n",
    "### https://www.amazon.com/dp/B0CQRGWGLY\n",
    "#### Code for chapter 8\n",
    "\n",
    "---\n",
    "\n",
    "# About this code file:\n",
    "\n",
    "### This notebook will reproduce most of the figures in this chapter (some figures were made in Inkscape), and illustrate the statistical concepts explained in the text. The point of providing the code is not just for you to recreate the figures, but for you to modify, adapt, explore, and experiment with the code.\n",
    "\n",
    "### Solutions to all exercises are at the bottom of the notebook.\n",
    "\n",
    "#### This code was written in google-colab. The notebook may require some modifications if you use a different IDE."
   ],
   "metadata": {
    "id": "mzfXc9E3Xq7k"
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "dFT1qwVEyxTW"
   },
   "outputs": [],
   "source": [
    "# import libraries and define global settings\n",
    "import numpy as np\n",
    "import scipy.stats as stats\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# define global figure properties used for publication\n",
    "import matplotlib_inline.backend_inline\n",
    "matplotlib_inline.backend_inline.set_matplotlib_formats('svg') # display figures in vector format\n",
    "plt.rcParams.update({'font.size':14,             # font size\n",
    "                     'savefig.dpi':300,          # output resolution\n",
    "                     'axes.titlelocation':'left',# title location\n",
    "                     'axes.spines.right':False,  # remove axis bounding box\n",
    "                     'axes.spines.top':False,    # remove axis bounding box\n",
    "                     })"
   ]
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "IpajvgPyTI22"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 8.1: Pie charts for margin figure"
   ],
   "metadata": {
    "id": "No4ZIfb7-HNM"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# a bit of code to show how to get equal areas for some number of pie slices\n",
    "k = 2\n",
    "np.tile(1/k,k)"
   ],
   "metadata": {
    "id": "GAbOT50y-4ix"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "_,axs = plt.subplots(2,1,figsize=(4,6))\n",
    "\n",
    "for a,k in zip(axs,[2,6]):\n",
    "\n",
    "  # draw the pie (and export the patches and text to update the color)\n",
    "  patches,_,autotexts = a.pie(np.tile(1/k,k),autopct='%.1f%%',wedgeprops={'edgecolor':'k'},\n",
    "        colors=np.linspace((.2,.2,.2),(1,1,1),k))\n",
    "\n",
    "  for autotext, patch in zip(autotexts,patches):\n",
    "    inverse_color = 1 - np.array(patch.get_facecolor())\n",
    "    inverse_color[-1] = 1 # invert the color, but not the alpha\n",
    "    autotext.set_color(inverse_color)\n",
    "\n",
    "axs[0].set_title('Coin flip',y=.9)\n",
    "axs[1].set_title('Die roll',y=.9)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('prob_probsInPies.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "jMP_iibr-JdP"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "szGIPfY7-HJm"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 8.4: Visualizing probability masses and densities"
   ],
   "metadata": {
    "id": "p183ZeO4-HD0"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# categorical probability data\n",
    "categoryLabels = [ 'SUV','Convert.','Sports','Minivan','Coupe' ]\n",
    "categoryData = np.random.randint(low=5,high=30,size=len(categoryLabels)).astype(np.float64)\n",
    "categoryData /= np.sum(categoryData)\n",
    "\n",
    "# discrete numerical probability data\n",
    "empiricalIQ = np.random.normal(loc=100,scale=15,size=100)\n",
    "\n",
    "# continuous (analytic) probability data\n",
    "x = np.linspace(-4,4,101)\n",
    "continuousData = stats.norm.pdf(x)*15 + 100\n",
    "\n",
    "\n",
    "### visualize!\n",
    "_,axs = plt.subplots(1,3,figsize=(10,3))\n",
    "\n",
    "# categorical data in bars\n",
    "axs[0].bar(categoryLabels,categoryData,color=[.8,.8,.8],edgecolor='k')\n",
    "axs[0].set_title(r'$\\bf{A}$)  pmf of car types')\n",
    "axs[0].set(ylabel='Probability',yticks=[])\n",
    "axs[0].tick_params(axis='x',rotation=45)\n",
    "\n",
    "# empirical probability data that estimate a density, still in bars\n",
    "axs[1].hist(empiricalIQ,bins=15,color=[.8,.8,.8],edgecolor='k')\n",
    "axs[1].set(xlabel='IQ',ylabel='Probability',yticks=[],xlim=[40,160])\n",
    "axs[1].set_title(r'$\\bf{B}$)  pmf of IQ')\n",
    "\n",
    "# analytical probability density as a line\n",
    "axs[2].plot(x*15+100,continuousData,'k')\n",
    "axs[2].set(xlabel='IQ',ylabel='Probability',yticks=[],xlim=[40,160])\n",
    "axs[2].set_title(r'$\\bf{C}$)  pdf of IQ')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('prob_visualizeMassDensity.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "bm7y9d1qDWDo"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "yHsCqOC3DWHp"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 8.5: Probability mass function of penguin weights"
   ],
   "metadata": {
    "id": "Lwq6Fa4tDWKv"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "penguins = np.arctanh(np.random.uniform(size=473)*1.8-.9)*2+4.5\n",
    "\n",
    "bin_edges = np.arange(np.min(penguins),np.max(penguins),step=.25)\n",
    "\n",
    "plt.figure(figsize=(6,4))\n",
    "plt.hist(penguins,bins=bin_edges,density=True,\n",
    "         color=[.8,.8,.8],edgecolor='k')\n",
    "plt.xlabel('Penguin weight (kg)')\n",
    "plt.ylabel('Probability')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('prob_penguinWeightProb.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "a3fuGlLEQIbn"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "42MKloL-0IpR"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 8.6: pdf and cdf"
   ],
   "metadata": {
    "id": "15dtzw9yrAm7"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# distribution\n",
    "x = np.linspace(-5,5,501)\n",
    "pdf = stats.norm.pdf(x)\n",
    "cdf = stats.norm.cdf(x)\n",
    "\n",
    "\n",
    "_,ax = plt.subplots(1,figsize=(7,3))\n",
    "\n",
    "# patch for the summed area\n",
    "from matplotlib.patches import Polygon\n",
    "bndi = np.argmin(np.abs(x-1))\n",
    "dots = np.zeros((bndi+2,2))\n",
    "for i in range(bndi+1):\n",
    "  dots[i,:] = x[i],pdf[i]\n",
    "dots[-1,:] = x[bndi],0\n",
    "ax.add_patch(Polygon(dots[:,[0,1]],facecolor='k',alpha=.4))\n",
    "\n",
    "# plot the functions\n",
    "ax.plot(x,pdf,'k--',linewidth=2,label='pdf')\n",
    "ax.plot(x,cdf,'k',linewidth=2,label='cdf')\n",
    "ax.axvline(1,color='k',linestyle=':')\n",
    "\n",
    "# make the plot a bit nicer\n",
    "ax.set(xlim=x[[0,-1]],ylim=[0,1.02],xlabel='Data value',ylabel='Probability')\n",
    "plt.legend(loc='upper left')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('prob_pdf2cdf.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "6RYJSQW2rAhk"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "qEAxF8XLrAfD"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 8.7: pdf/cdf combos of some example distributions"
   ],
   "metadata": {
    "id": "VaVwrV9f5GXI"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "\n",
    "_,axs = plt.subplots(1,3,figsize=(10,3))\n",
    "\n",
    "# Gaussian\n",
    "x = np.linspace(-5,5,101)\n",
    "axs[0].plot(x,stats.norm.pdf(x),'k--',linewidth=2)\n",
    "axs[0].plot(x,stats.norm.cdf(x),'k',linewidth=2)\n",
    "axs[0].set_title(r'$\\bf{A}$)  Normal')\n",
    "axs[0].set_xlim(x[[0,-1]])\n",
    "\n",
    "# F\n",
    "x = np.linspace(0,6,101)\n",
    "axs[1].plot(x,stats.f.pdf(x,5,100),'k--',linewidth=2)\n",
    "axs[1].plot(x,stats.f.cdf(x,5,100),'k',linewidth=2)\n",
    "axs[1].set_title(r'$\\bf{B}$)  F')\n",
    "axs[1].set_xlim(x[[0,-1]])\n",
    "\n",
    "# semicircular\n",
    "x = np.linspace(-1.5,1.5,101)\n",
    "axs[2].plot(x,stats.semicircular.pdf(x),'k--',linewidth=2)\n",
    "axs[2].plot(x,stats.semicircular.cdf(x),'k',linewidth=2)\n",
    "axs[2].set_title(r'$\\bf{C}$)  Semicircular')\n",
    "axs[2].set_xlim(x[[0,-1]])\n",
    "\n",
    "# legends\n",
    "for a in axs: a.legend(['pdf','cdf'],frameon=False)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('prob_examplePdfCdf.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "MTe3436BrAcK"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "NqP5tJPVf6RE"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 8.11: Softmax vs. \"raw\" probability"
   ],
   "metadata": {
    "id": "OJVxdyTcAFK4"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "x = [4,5,7]\n",
    "\n",
    "# softmax transformation\n",
    "num = np.exp(x)\n",
    "den = np.sum( np.exp(x) )\n",
    "sigma = num / den\n",
    "\n",
    "print(sigma)"
   ],
   "metadata": {
    "id": "m89gODR9ntJS"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# table data\n",
    "colLabs = ['Raw','Softmax']\n",
    "\n",
    "tabdat = []\n",
    "for xi,si in zip(x,sigma):\n",
    "  tabdat.append([f'{xi:.0f}',f'{si:.3f}'])\n",
    "\n",
    "# draw the table\n",
    "fig, ax = plt.subplots(figsize=(2.7,3))\n",
    "ax.set_axis_off()\n",
    "ht = ax.table(\n",
    "        cellText   = tabdat,\n",
    "        colLabels  = colLabs,\n",
    "        colColours = [(.8,.8,.8)] * len(colLabs),\n",
    "        cellLoc    = 'center',\n",
    "        loc        = 'upper left',\n",
    "        )\n",
    "\n",
    "\n",
    "# some adjustments to the fonts etc\n",
    "ht.scale(1,3.8)\n",
    "ht.auto_set_font_size(False)\n",
    "ht.set_fontsize(14)\n",
    "\n",
    "from matplotlib.font_manager import FontProperties\n",
    "for (row, col), cell in ht.get_celld().items():\n",
    "  cell.set_text_props(fontproperties=FontProperties(family='serif'))\n",
    "  if row==0: cell.set_text_props(fontproperties=FontProperties(weight='bold',size=16))\n",
    "\n",
    "# export\n",
    "plt.tight_layout()\n",
    "plt.savefig('prob_table_softmax.png', bbox_inches='tight')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "ul1QJu0VoD1d"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "oWaoNPJIrWr8"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# the data (marble color counts)\n",
    "counts = np.array([ 40,30,20 ])\n",
    "\n",
    "# softmax\n",
    "num = np.exp(counts)\n",
    "den = np.sum( np.exp(counts) )\n",
    "sigma = num / den\n",
    "\n",
    "# standard probabilities\n",
    "probs = 100*counts / np.sum(counts)\n",
    "\n",
    "# print the results\n",
    "print('Softmax:')\n",
    "print(sigma)\n",
    "\n",
    "print(' ')\n",
    "print('Probabilities:')\n",
    "print(probs)"
   ],
   "metadata": {
    "id": "9UjbRyXHAFFN"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# with raw counts\n",
    "x = np.linspace(0,25,21)\n",
    "s = np.exp(x) / np.sum(np.exp(x))\n",
    "p = x / np.sum(x)\n",
    "\n",
    "plt.plot(x,s,'k^-',markerfacecolor='w',markersize=8,label=f'Softmax (sum={np.sum(s)})')\n",
    "plt.plot(x,p,'ko-',markerfacecolor='w',markersize=8,label=f'Probability (sum={np.sum(p)})')\n",
    "plt.legend()\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "w6jNFGedAFCU"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "-zycU8nL5crp"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 8.10: Softmax in linear and log space"
   ],
   "metadata": {
    "id": "hy3y4TNX5c7F"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# with raw numerical data values\n",
    "x = np.linspace(-6,6,81)\n",
    "s = np.exp(x) / np.sum(np.exp(x))\n",
    "\n",
    "_,axs = plt.subplots(1,2,figsize=(10,4))\n",
    "\n",
    "for a in axs:\n",
    "  a.plot(x,s,'k-',linewidth=2)\n",
    "  a.set(xlabel='Raw data values',ylabel='Softmaxified values',xlim=x[[0,-1]])\n",
    "\n",
    "axs[1].set_yscale('log')\n",
    "axs[0].set_title(r'$\\bf{A}$)  In linear space')\n",
    "axs[1].set_title(r'$\\bf{B}$)  In log space')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('prob_softmaxNumbers.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "zUz8kEEmAE_i"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "vHRAQJFPAE8o"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 1"
   ],
   "metadata": {
    "id": "DLKS8J9Lf6OK"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# re-create the pdf\n",
    "# (note: I'm using \"4\" in the variable names for comparisons in subsequent exercises)\n",
    "x4 = np.linspace(-4,4,400)\n",
    "pdf4 = stats.norm.pdf(x4)\n",
    "\n",
    "# normalize by dx\n",
    "pdf4N = pdf4*(x4[1]-x4[0])\n",
    "\n",
    "# print sums\n",
    "print(f'Sum over pdf: {np.sum(pdf4):.3f}')\n",
    "print(f'Sum over normalized pdf: {np.sum(pdf4N):.3f}')"
   ],
   "metadata": {
    "id": "rpTLweGaf7z2"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "u2E8Q7KP1eVI"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 2"
   ],
   "metadata": {
    "id": "x8igWRFBxzoQ"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# now with a restricted range\n",
    "x2 = np.linspace(-2,2,300)\n",
    "pdf2 = stats.norm.pdf(x2)\n",
    "\n",
    "# normalize by dx\n",
    "pdf2N = pdf2*(x2[1]-x2[0])\n",
    "\n",
    "# normalize to sum=1 (\"U\" is for \"unit\")\n",
    "pdf2U = pdf2 / np.sum(pdf2)\n",
    "\n",
    "# print sums\n",
    "print(f'Sum over pdf normalized by dx : {np.sum(pdf2N):.3f}')\n",
    "print(f'Sum over pdf normalized by sum: {np.sum(pdf2U):.3f}')"
   ],
   "metadata": {
    "id": "0Z3Tqmevf72C"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# plot\n",
    "plt.figure(figsize=(7,4))\n",
    "plt.plot(x4,pdf4N,'k--',linewidth=2,label='x to |4|')\n",
    "plt.plot(x2,pdf2N,':',color=(.2,.2,.2,),linewidth=2,label='Norm by dx')\n",
    "plt.plot(x2,pdf2U,color=(.6,.6,.6),linewidth=2,label='Norm to unit')\n",
    "plt.xlim(x4[[0,-1]])\n",
    "plt.legend()\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('prob_ex2.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "KPkwRWGZf74m"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# From the explanations about exercise 1:\n",
    "print(f'Sum over normalized pdf: {np.sum(pdf4N):.9f}')"
   ],
   "metadata": {
    "id": "j5MzkRbs1gaW"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "W7KrnMUff6AK"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 3"
   ],
   "metadata": {
    "id": "mJOlcwQTf59P"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# parameters\n",
    "npnts = [100,1000]\n",
    "colors = ['k','gray']\n",
    "\n",
    "\n",
    "plt.figure(figsize=(8,4))\n",
    "\n",
    "# the loop that does it all\n",
    "for resolution,linecolor in zip(npnts,colors):\n",
    "\n",
    "  # create x-axis grid\n",
    "  x = np.linspace(-4,4,resolution)\n",
    "\n",
    "  # evaluate \"raw\" pdf\n",
    "  pdf = stats.norm.pdf(x)\n",
    "\n",
    "  # normalize by dx\n",
    "  pdfN = pdf*(x[1]-x[0])\n",
    "\n",
    "  # plot the normalized pdf\n",
    "  plt.plot(x,pdfN,'|',linewidth=2,color=linecolor,markersize=4,\n",
    "           label=f'N={resolution}, sum={np.sum(pdfN):.2f}')\n",
    "\n",
    "\n",
    "plt.xlim(x[[0,-1]])\n",
    "plt.legend(bbox_to_anchor=[.55,.8])\n",
    "plt.xlabel('Data value')\n",
    "plt.ylabel('Probability')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('prob_ex3.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "AxL1WQngpn-3"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "5nDiMV-opoBq"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 4"
   ],
   "metadata": {
    "id": "y_xz60Qkr4Ok"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# create cdf from pdf\n",
    "x = np.linspace(-4,4,300)\n",
    "pdf = stats.norm.pdf(x)\n",
    "\n",
    "# python's cdf\n",
    "cdf_sp = stats.norm.cdf(x)\n",
    "\n",
    "# manual computation\n",
    "cdf_my = np.cumsum(pdf)\n",
    "\n",
    "\n",
    "_,axs = plt.subplots(3,1,figsize=(7,6))\n",
    "\n",
    "axs[0].plot(x,pdf,'k',linewidth=2)\n",
    "axs[0].set(xlim=x[[0,-1]])\n",
    "axs[0].set_title(r'$\\bf{A}$)  Gaussian pdf (raw output)')\n",
    "\n",
    "\n",
    "axs[1].plot(x,cdf_my,'k--',linewidth=2,label='cumsum')\n",
    "axs[1].plot(x,cdf_sp,color='gray',linewidth=2,label='scipy')\n",
    "axs[1].set(xlim=x[[0,-1]])\n",
    "axs[1].set_title(r\"$\\bf{B}$)  Gaussian cdfs\")\n",
    "axs[1].legend()\n",
    "\n",
    "\n",
    "# normalized by dx\n",
    "cdf_myN = np.cumsum(pdf) * (x[1]-x[0])\n",
    "\n",
    "axs[2].plot(x,cdf_myN,'k--',linewidth=2,label='cumsum')\n",
    "axs[2].plot(x,cdf_sp,color='gray',linewidth=2,label='scipy')\n",
    "axs[2].set(xlim=x[[0,-1]])\n",
    "axs[2].set_title(r\"$\\bf{C}$)  Gaussian cdfs with normalization\")\n",
    "axs[2].legend()\n",
    "\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('prob_ex4.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "kos1p2zRf7xL"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "N2a1-hBCQGL9"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 5"
   ],
   "metadata": {
    "id": "ZgiBzJIqVJ5p"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# cdfs with different resolutions\n",
    "x = np.linspace(-4,4,5000)\n",
    "pdf = stats.norm.pdf(x)\n",
    "\n",
    "# python's cdf\n",
    "cdf_sp = stats.norm.cdf(x)\n",
    "\n",
    "# manual computation (normalized)\n",
    "cdf_my = np.cumsum(pdf) * (x[1]-x[0])\n",
    "\n",
    "\n",
    "_,axs = plt.subplots(2,1,figsize=(8,5))\n",
    "\n",
    "axs[0].plot(x,cdf_sp,color='gray',linewidth=2,label='scipy')\n",
    "axs[0].plot(x,cdf_my,'k--',linewidth=2,label='cumsum')\n",
    "axs[0].set(xlim=x[[0,-1]])\n",
    "axs[0].set_title(r\"$\\bf{A}$)  High resolution (5000 points)\")\n",
    "axs[0].legend()\n",
    "\n",
    "\n",
    "\n",
    "x = np.linspace(-4,4,20)\n",
    "cdf_sp = stats.norm.cdf(x)\n",
    "cdf_my = np.cumsum(stats.norm.pdf(x)) * (x[1]-x[0])\n",
    "\n",
    "axs[1].plot(x,cdf_sp,'o-',color='gray',linewidth=2,label='scipy')\n",
    "axs[1].plot(x,cdf_my,'ks--',linewidth=2,label='cumsum')\n",
    "axs[1].set(xlim=x[[0,-1]])\n",
    "axs[1].set_title(r\"$\\bf{B}$)  Low resolution (20 points)\")\n",
    "axs[1].legend()\n",
    "\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('prob_ex5.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "PHHDIiwupoEV"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "lOvVGAMBpoHB"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 6"
   ],
   "metadata": {
    "id": "viY8JIF8poJ2"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# the non-standard pdf\n",
    "x = np.linspace(-6,6,1001)\n",
    "pdf = stats.norm.pdf(x-2.7) + stats.norm.pdf(x+2.7)\n",
    "\n",
    "# simple scaling by dx\n",
    "cdf = np.cumsum(pdf) * np.mean(np.diff(x))\n",
    "\n",
    "# better scaling by first unit-sum-normalizing pdf\n",
    "pdfN = pdf / np.sum(pdf)\n",
    "cdfN = np.cumsum(pdfN)\n",
    "\n",
    "# another option: scale cdf to have a final value of 1\n",
    "# cdfN = cdf / cdf[-1]\n",
    "\n",
    "\n",
    "axs = plt.subplots(3,1,figsize=(4,6))[1]\n",
    "\n",
    "axs[0].plot(x,pdf,'k',linewidth=2)\n",
    "axs[0].set(yticks=[0,.3])\n",
    "axs[0].set_title(r'$\\bf{A}$)  pdf')\n",
    "\n",
    "axs[1].plot(x,cdf,'k',linewidth=2)\n",
    "axs[1].axhline(1,color=[.7,.7,.7],linewidth=.7,linestyle='--',zorder=-1)\n",
    "axs[1].set_title(r'$\\bf{B}$)  Improperly scaled cdf')\n",
    "\n",
    "axs[2].plot(x,cdfN,'k',linewidth=2)\n",
    "axs[2].axhline(1,color=[.7,.7,.7],linewidth=.7,linestyle='--',zorder=-1)\n",
    "axs[2].set_title(r'$\\bf{C}$)  Properly scaled cdf')\n",
    "\n",
    "\n",
    "\n",
    "for a in axs:\n",
    "  a.set(xlim=x[[0,-1]])\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('prob_ex6.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "bubFPPKrYvOP"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "0arML1aY40Dw"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 7"
   ],
   "metadata": {
    "id": "B4xuMJeK40Ap"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# create a cdf\n",
    "x = np.linspace(0,10,200)\n",
    "cdf = stats.lognorm.cdf(x,1,1/2)\n",
    "\n",
    "# empirical pdf via difference (D=difference)\n",
    "pdfD = np.diff(cdf)\n",
    "\n",
    "# analytical pdf (A=analytical)\n",
    "pdfA = stats.lognorm.pdf(x,1,1/2)\n",
    "pdfA *= x[1]-x[0]\n",
    "\n",
    "\n",
    "_,axs = plt.subplots(2,1,figsize=(6,5))\n",
    "axs[0].plot(x,cdf,'k',linewidth=2)\n",
    "axs[0].set_title(r'$\\bf{A}$)  cdf of lognormal distribution')\n",
    "\n",
    "axs[1].plot(x,pdfA,'o',color=(.4,.4,.4),markerfacecolor='w',\n",
    "            linewidth=2,label='pdf from function')\n",
    "axs[1].plot(x[:-1],pdfD,'k',linewidth=2,label='pdf from cdf')\n",
    "axs[1].set_title(r'$\\bf{B}$)  pdfs of lognormal distribution')\n",
    "axs[1].legend()\n",
    "\n",
    "for a in axs:\n",
    "  a.set(xlim=x[[0,-1]],xlabel='Data value',ylabel='Prob')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('prob_ex7.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "QgBVIDz-4z9y"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# Note: The two pdf's appear to have some mismatch.\n",
    "#       Try changing the resolution from 200 to 2000, and then to 20."
   ],
   "metadata": {
    "id": "it7U8Mo-pvX5"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "hhdj6WQoYvQk"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 8"
   ],
   "metadata": {
    "id": "iE8bUHLmAEuW"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# colored marble counts\n",
    "blue   = 40\n",
    "yellow = 30\n",
    "orange = 20\n",
    "totalMarbs = blue + yellow + orange\n",
    "\n",
    "# put them all in a jar\n",
    "jar = np.hstack((1*np.ones(blue),2*np.ones(yellow),3*np.ones(orange)))\n",
    "\n",
    "# now we draw 500 marbles (with replacement)\n",
    "numDraws = 500\n",
    "marbSample = np.zeros(numDraws)\n",
    "\n",
    "\n",
    "# I wrote the experiment in a loop\n",
    "# to show how it maps onto a real-world experiment\n",
    "# of data points being gathered one at a time.\n",
    "for drawi in range(numDraws):\n",
    "\n",
    "  # generate a random integer to draw\n",
    "  randmarble = int(np.random.rand()*len(jar))\n",
    "\n",
    "  # store the color of that marble\n",
    "  marbSample[drawi] = jar[randmarble]\n",
    "\n",
    "\n",
    "# but in practice, you can implement it without a loop\n",
    "marbSample = np.random.choice(jar,size=numDraws,replace=True)\n",
    "\n",
    "\n",
    "# now we need to know the proportion of colors drawn\n",
    "propBlue = sum(marbSample==1) / numDraws\n",
    "propYell = sum(marbSample==2) / numDraws\n",
    "propOran = sum(marbSample==3) / numDraws\n",
    "\n",
    "\n",
    "# plot those against the theoretical probability\n",
    "plt.figure(figsize=(8,4))\n",
    "plt.bar([1,2,3],[ propBlue, propYell, propOran ],label='Proportion',color=(.7,.7,.7))\n",
    "plt.plot([0.5, 1.5],[blue/totalMarbs, blue/totalMarbs],'k',linewidth=3,label='Probability')\n",
    "plt.plot([1.5, 2.5],[yellow/totalMarbs,yellow/totalMarbs],'k',linewidth=3)\n",
    "plt.plot([2.5, 3.5],[orange/totalMarbs,orange/totalMarbs],'k',linewidth=3)\n",
    "\n",
    "plt.xticks([1,2,3],labels=('Blue','Yellow','Orange'))\n",
    "plt.xlabel('Marble color')\n",
    "plt.ylabel('Proportion/probability')\n",
    "plt.legend()\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('prob_ex8.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "qXlT5jJV0IX7"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "vw8yV6v7phqC"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 9"
   ],
   "metadata": {
    "id": "hScLAtSgphmu"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# colored marble counts\n",
    "blue   = 40\n",
    "yellow = 30\n",
    "orange = 20\n",
    "totalMarbs = blue + yellow + orange\n",
    "\n",
    "\n",
    "\n",
    "# range of sample sizes\n",
    "sampleSizes = np.arange(20,2001,step=10)\n",
    "\n",
    "# some initializations to simplify the code in the experiment\n",
    "empiProbs = np.zeros(3)\n",
    "trueProbs = np.array([ blue/totalMarbs,yellow/totalMarbs,orange/totalMarbs ])\n",
    "\n",
    "# initialize\n",
    "rms = np.zeros(len(sampleSizes))\n",
    "\n",
    "\n",
    "# run the experiment\n",
    "for idx,thisN in enumerate(sampleSizes):\n",
    "\n",
    "  # draw N marbles\n",
    "  drawColors = np.random.choice(jar,size=thisN,replace=True)\n",
    "\n",
    "  # compute proportion\n",
    "  for ei in range(3):\n",
    "    empiProbs[ei] = np.sum(drawColors==(ei+1)) / thisN\n",
    "\n",
    "  # compute the sum of squared errors\n",
    "  rms[idx] = np.sqrt( np.mean( (empiProbs-trueProbs)**2 ) )\n",
    "\n",
    "\n",
    "plt.plot(sampleSizes,rms,'ks',markerfacecolor=[.7,.7,.7])\n",
    "plt.xlim([sampleSizes[0]-30,sampleSizes[-1]+30])\n",
    "plt.xlabel('Sample size')\n",
    "plt.ylabel('RMS')\n",
    "plt.title('Empirical proportion errors',loc='center')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('prob_ex9.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "jegB9WhHpj-X"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "DoWwLuvPphj4"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 10"
   ],
   "metadata": {
    "id": "JgJr5INHK3je"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# generate some data, compute proportions of data above some value\n",
    "\n",
    "# simulation parameters\n",
    "N = 1000 # sample size\n",
    "k = 41   # number of data bins\n",
    "\n",
    "# generage the data\n",
    "data = np.random.randn(N)\n",
    "\n",
    "# determine boundaries (based on a priori knowledge of the distribution)\n",
    "bounds = np.linspace(-3,3,k) # what I call 'zeta' in the text\n",
    "\n",
    "# initialize the results (empirical proportions)\n",
    "emppropsGT = np.zeros(k)\n",
    "emppropsLT = np.zeros(k)\n",
    "empprops2tail = np.zeros(k)\n",
    "\n",
    "\n",
    "# loop over the boundaries\n",
    "for idx,bi in enumerate(bounds):\n",
    "\n",
    "  # empirical proportions for each side separately\n",
    "  emppropsGT[idx] = np.sum(data>bi) / N\n",
    "  emppropsLT[idx] = np.sum(data<bi) / N\n",
    "\n",
    "  # and for the two-sided\n",
    "  empprops2tail[idx] = np.sum(data>np.abs(bi)) / N\n",
    "\n",
    "\n",
    "\n",
    "# visualize\n",
    "_,axs = plt.subplots(1,2,figsize=(10,4))\n",
    "\n",
    "axs[0].plot(bounds,emppropsGT,'ks',markerfacecolor=(.8,.8,.8),markersize=8,label=r'$p(x>\\zeta)$')\n",
    "axs[0].plot(bounds,emppropsLT,'ko',markerfacecolor=(.4,.4,.4),markersize=8,label=r'$p(x<\\zeta)$')\n",
    "axs[0].legend()\n",
    "axs[0].set(xlabel=r'Bound ($\\zeta$)',ylim=[-.05,1.05],ylabel='Proportion',title=r'$\\bf{A}$)  One-sided proportions')\n",
    "\n",
    "axs[1].plot(bounds,empprops2tail,'k^',markerfacecolor=(.8,.8,.8),markersize=8,label=r'$p(x>|\\zeta|)$')\n",
    "axs[1].set(xlabel=r'Bound ($\\zeta$)',ylim=[-.05,1.05],ylabel='Proportion',title=r'$\\bf{B}$)  Two-sided proportion')\n",
    "axs[1].legend()\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('prob_ex10.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "4DMLILcqK3g2"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "## Note about the above exercise:\n",
    "# I encourage you to try different distributions for generating data, but\n",
    "# you will need to adjust the proportion bounds; -3 to +3 is hard-coded to\n",
    "# be appropriate for a normal distribution.\n",
    "#\n",
    "# Consider this to be an additional way to challenge yourself :)"
   ],
   "metadata": {
    "id": "4RGEch1uVDWs"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "YWGNGDkxphbw"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 11"
   ],
   "metadata": {
    "id": "ztmOyQblphhD"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "### Here's the url:\n",
    "# https://docs.scipy.org/doc/scipy/reference/stats.html#continuous-distributions\n",
    "\n",
    "# All you need to do is copy the code for Figure 8.7 but replace \"norm\" with, e.g., \"crystalball.\"\n",
    "# I recommend removing the code to create the gray patches.\n"
   ],
   "metadata": {
    "id": "JTcYJoVJpheY"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "DePBZBt-K3Qh"
   },
   "execution_count": null,
   "outputs": []
  }
 ]
}